R Markdown

setwd("/Users/kenyaroy/doing data science/unit 8")
Beers = read.csv("/Users/kenyaroy/doing data science/unit 8/Beers.csv")
Breweries = read.csv("/Users/kenyaroy/doing data science/unit 8/Breweries.csv")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(naniar)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(usmap)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ purrr   1.0.1
## ✔ tidyr   1.2.1     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
library(class)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
#How many breweries are present in each state
Breweries %>% group_by(State) %>% dplyr::summarize(n()) %>% print(n=51)
## # A tibble: 51 × 2
##    State `n()`
##    <chr> <int>
##  1 " AK"     7
##  2 " AL"     3
##  3 " AR"     2
##  4 " AZ"    11
##  5 " CA"    39
##  6 " CO"    47
##  7 " CT"     8
##  8 " DC"     1
##  9 " DE"     2
## 10 " FL"    15
## 11 " GA"     7
## 12 " HI"     4
## 13 " IA"     5
## 14 " ID"     5
## 15 " IL"    18
## 16 " IN"    22
## 17 " KS"     3
## 18 " KY"     4
## 19 " LA"     5
## 20 " MA"    23
## 21 " MD"     7
## 22 " ME"     9
## 23 " MI"    32
## 24 " MN"    12
## 25 " MO"     9
## 26 " MS"     2
## 27 " MT"     9
## 28 " NC"    19
## 29 " ND"     1
## 30 " NE"     5
## 31 " NH"     3
## 32 " NJ"     3
## 33 " NM"     4
## 34 " NV"     2
## 35 " NY"    16
## 36 " OH"    15
## 37 " OK"     6
## 38 " OR"    29
## 39 " PA"    25
## 40 " RI"     5
## 41 " SC"     4
## 42 " SD"     1
## 43 " TN"     3
## 44 " TX"    28
## 45 " UT"     4
## 46 " VA"    16
## 47 " VT"    10
## 48 " WA"    23
## 49 " WI"    20
## 50 " WV"     1
## 51 " WY"     4
Breweries %>% ggplot(aes(x = State, fill = State)) + geom_bar() + ggtitle("Distribution of Breweries by State") + ylab(" # of Breweries") + geom_text(stat = "count", aes(label = after_stat(count)), vjust = 0) + theme(legend.position = "none")

#Heat Map of breweries in each state.
library(usmap)
StateBeerC = data.frame(state = c("AK","AL","AR","AZ","CA","CO","CT","DC","DE","FL","GA","HI","IA","ID","IL","IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC","ND","NE","NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VA","VT","WA","WI","WV","WY"),values = c(7,3,2,11,39,47,8,1,2,15,7,4,5,5,18,22,3,4,5,23,7,9,32,12,9,2,9,19,1,5,3,3,4,2,16,15,6,29,25,5,4,1,3,28,4,16,10,23,20,1,4))

plot_usmap(data = StateBeerC, values = "values", regions = "state") + scale_fill_continuous(low = "yellow", high = "red", name = "Number of Breweries", label = scales::comma) + labs(title = "Number of Breweries By State", ) + theme(legend.position = "right")

#Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the #merged file.
Breweries$Brewery_id = Breweries$Brew_ID
Breweries <- Breweries %>% select(Brewery_id, Name, City, State)
BB <- merge(Beers,Breweries, by = "Brewery_id", all = TRUE)
dim(BB)
## [1] 2410   10
summary(BB)
##    Brewery_id       Name.x             Beer_ID            ABV         
##  Min.   :  1.0   Length:2410        Min.   :   1.0   Min.   :0.00100  
##  1st Qu.: 94.0   Class :character   1st Qu.: 808.2   1st Qu.:0.05000  
##  Median :206.0   Mode  :character   Median :1453.5   Median :0.05600  
##  Mean   :232.7                      Mean   :1431.1   Mean   :0.05977  
##  3rd Qu.:367.0                      3rd Qu.:2075.8   3rd Qu.:0.06700  
##  Max.   :558.0                      Max.   :2692.0   Max.   :0.12800  
##                                                      NA's   :62       
##       IBU            Style               Ounces         Name.y         
##  Min.   :  4.00   Length:2410        Min.   : 8.40   Length:2410       
##  1st Qu.: 21.00   Class :character   1st Qu.:12.00   Class :character  
##  Median : 35.00   Mode  :character   Median :12.00   Mode  :character  
##  Mean   : 42.71                      Mean   :13.59                     
##  3rd Qu.: 64.00                      3rd Qu.:16.00                     
##  Max.   :138.00                      Max.   :32.00                     
##  NA's   :1005                                                          
##      City              State          
##  Length:2410        Length:2410       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
head(BB)
##   Brewery_id        Name.x Beer_ID   ABV IBU
## 1          1  Get Together    2692 0.045  50
## 2          1 Maggie's Leap    2691 0.049  26
## 3          1    Wall's End    2690 0.048  19
## 4          1       Pumpion    2689 0.060  38
## 5          1    Stronghold    2688 0.060  25
## 6          1   Parapet ESB    2687 0.056  47
##                                 Style Ounces             Name.y        City
## 1                        American IPA     16 NorthGate Brewing  Minneapolis
## 2                  Milk / Sweet Stout     16 NorthGate Brewing  Minneapolis
## 3                   English Brown Ale     16 NorthGate Brewing  Minneapolis
## 4                         Pumpkin Ale     16 NorthGate Brewing  Minneapolis
## 5                     American Porter     16 NorthGate Brewing  Minneapolis
## 6 Extra Special / Strong Bitter (ESB)     16 NorthGate Brewing  Minneapolis
##   State
## 1    MN
## 2    MN
## 3    MN
## 4    MN
## 5    MN
## 6    MN
tail(BB)
##      Brewery_id                    Name.x Beer_ID   ABV IBU
## 2405        556             Pilsner Ukiah      98 0.055  NA
## 2406        557  Heinnieweisse Weissebier      52 0.049  NA
## 2407        557           Snapperhead IPA      51 0.068  NA
## 2408        557         Moo Thunder Stout      50 0.049  NA
## 2409        557         Porkslap Pale Ale      49 0.043  NA
## 2410        558 Urban Wilderness Pale Ale      30 0.049  NA
##                        Style Ounces                        Name.y          City
## 2405         German Pilsener     12         Ukiah Brewing Company         Ukiah
## 2406              Hefeweizen     12       Butternuts Beer and Ale Garrattsville
## 2407            American IPA     12       Butternuts Beer and Ale Garrattsville
## 2408      Milk / Sweet Stout     12       Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA)     12       Butternuts Beer and Ale Garrattsville
## 2410        English Pale Ale     12 Sleeping Lady Brewing Company     Anchorage
##      State
## 2405    CA
## 2406    NY
## 2407    NY
## 2408    NY
## 2409    NY
## 2410    AK
#Address the missing values in each column
#https://www.masterclass.com/articles/ibu-beer
#There are styles of beer and each style has a range of IBU, we could use the sytle of the beer to assign an average value 

MeanIBU <- BB %>% filter(!is.na(IBU)) %>% group_by(Style) %>% dplyr::summarize(IBUMean = mean(IBU))
TestBB <- merge(BB,MeanIBU, by="Style")
TestBB$IBU[is.na(TestBB$IBU)] <- TestBB$IBUMean[is.na(TestBB$IBU)]

MeanABV <- BB %>% filter(!is.na(ABV)) %>% group_by(Style) %>% dplyr::summarize(ABVMean=mean(ABV))
TestBB1 <- merge(TestBB,MeanABV, by="Style")
TestBB1$ABV[is.na(TestBB1$ABV)] <- TestBB1$ABVMean[is.na(TestBB1$ABV)]

gg_miss_var(TestBB1) + ggtitle("Missing Values in Dataset")

#In order to find the missing values in ABV and IBU, we calculated the mean of each variable by Style and imputed #them into BB dataframe
#Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
TestBB %>%  ggplot(aes(x = State, y = IBU, fill = State)) + geom_bar(position = "dodge", stat = "summary", fun="median") + ggtitle("Distribution of Median IBU of Breweries by State") + ylab("International Bitterness Unit") + theme(legend.position="none")

#Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
TestBB1 %>%  ggplot(aes(x = State, y = ABV, fill = State)) + geom_bar(position = "dodge", stat = "summary", fun="median") + ggtitle("Distribution of Median ABV of Breweries by State") + ylab("Alcohol By Volume") + theme(legend.position="none")

#Which state has the most bitter (IBU) beer?
TestBB %>% dplyr::summarise(max(IBU))
##   max(IBU)
## 1      138
TestBB %>% filter(IBU == "138")
##                            Style Brewery_id                    Name.x Beer_ID
## 1 American Double / Imperial IPA        375 Bitter Bitch Imperial IPA     980
##     ABV IBU Ounces                  Name.y    City State IBUMean
## 1 0.082 138     12 Astoria Brewing Company Astoria    OR   93.32
p <- BB %>% ggplot(aes(x=IBU, fill=State)) + geom_bar() + ggtitle("Finding State with Maximum IBU") + theme(legend.position="none") + ylab("Beers")
ggplotly(p) 
## Warning: Removed 1005 rows containing non-finite values (`stat_count()`).
#The state with the most bitter (IBU) beer is Oregon.
#Which state has the maximum alcoholic (ABV) beer? 
TestBB1 %>% dplyr::summarise(max(ABV))
##   max(ABV)
## 1    0.128
TestBB1 %>% filter(ABV == "0.128")
##              Style Brewery_id
## 1 Quadrupel (Quad)         52
##                                                 Name.x Beer_ID   ABV IBU Ounces
## 1 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale    2565 0.128  24   19.2
##                    Name.y    City State IBUMean ABVMean
## 1 Upslope Brewing Company Boulder    CO      24   0.104
p <- BB %>% ggplot(aes(x=ABV, fill=State)) + geom_bar() + ggtitle("Finding State with Maximum ABV") + theme(legend.position="none") + ylab("Beers")
ggplotly(p)
## Warning: Removed 62 rows containing non-finite values (`stat_count()`).
#The state with the maximum alcoholic (ABV) beer is Colorado.
#Distribution of ABV versus IBU
summary(TestBB1$ABV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02700 0.05000 0.05600 0.05972 0.06700 0.12800
BB %>% filter(!is.na(ABV) & !is.na(IBU)) %>% select(ABV, IBU) %>% ggpairs()

BB %>% ggplot(aes(x=ABV, y=IBU)) + geom_point() +geom_smooth(position="jitter") + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 1005 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 1005 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1005 rows containing missing values (`geom_point()`).

TestBB1 %>% select(ABV, IBU) %>% ggpairs()

TestBB1 %>% ggplot(aes(x=ABV, y=IBU)) + geom_point() +geom_smooth(position="jitter") + geom_smooth() + ggtitle("Distribution of ABV versus IBU")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

#Distribution of IBU versus ABV by Style (IPA and Ale)
library(dplyr)
IPA <- TestBB1 %>% filter(grepl('IPA', Style)) 
Ale <- TestBB1 %>% filter(grepl('Ale', Style)) 
head(IPA)
##                            Style Brewery_id                     Name.x Beer_ID
## 1 American Double / Imperial IPA        167 Gordon Imperial Red (2010)     806
## 2 American Double / Imperial IPA        215               More Cowbell    2123
## 3 American Double / Imperial IPA        504              Gordon (2005)     805
## 4 American Double / Imperial IPA        167         GUBNA Imperial IPA       6
## 5 American Double / Imperial IPA        375  Bitter Bitch Imperial IPA     980
## 6 American Double / Imperial IPA        175              MechaHopzilla    1114
##     ABV    IBU Ounces                              Name.y        City State
## 1 0.087  85.00     12                 Oskar Blues Brewery    Longmont    CO
## 2 0.090 118.00     16       Buffalo Bayou Brewing Company     Houston    TX
## 3 0.092  85.00     12                 Oskar Blues Brewery       Lyons    CO
## 4 0.099 100.00     12                 Oskar Blues Brewery    Longmont    CO
## 5 0.082 138.00     12             Astoria Brewing Company     Astoria    OR
## 6 0.088  93.32     16 New Orleans Lager & Ale Brewing ... New Orleans    LA
##   IBUMean    ABVMean
## 1   93.32 0.08736893
## 2   93.32 0.08736893
## 3   93.32 0.08736893
## 4   93.32 0.08736893
## 5   93.32 0.08736893
## 6   93.32 0.08736893
head(Ale)
##                      Style Brewery_id                       Name.x Beer_ID
## 1         Abbey Single Ale         58      Abbey's Single (2015- )    2505
## 2         Abbey Single Ale         58 Abbey's Single Ale (Current)    1887
## 3 American Amber / Red Ale        387             Colorado Red Ale     220
## 4 American Amber / Red Ale         61       Reprise Centennial Red    2288
## 5 American Amber / Red Ale        487                  Wisco Disco     953
## 6 American Amber / Red Ale        202                 Loki Red Ale    2191
##     ABV     IBU Ounces                   Name.y        City State IBUMean
## 1 0.049 22.0000     12          Destihl Brewery Bloomington    IL 22.0000
## 2 0.049 22.0000     12          Destihl Brewery Bloomington    IL 22.0000
## 3 0.062 36.2987     12       Revolution Brewing      Paonia    CO 36.2987
## 4 0.060 36.2987     12  4 Hands Brewing Company Saint Louis    MO 36.2987
## 5 0.051 31.0000     16   Stillmank Beer Company   Green Bay    WI 36.2987
## 6 0.075 53.0000     16 Fearless Brewing Company    Estacada    OR 36.2987
##    ABVMean
## 1 0.049000
## 2 0.049000
## 3 0.057456
## 4 0.057456
## 5 0.057456
## 6 0.057456
BBIPAAle <- TestBB1 %>% filter(str_detect(Style, "IPA")|str_detect(Style, " Ale"))
BBIPAAle$AleType <- ifelse(str_detect(BBIPAAle$Style,"IPA"),"IPA","Ale")

BBIPAAle %>% ggplot(aes(x=ABV, y = IBU, color = AleType)) + geom_point() +ggtitle("Distribution of IBU versus ABV by Style (IPA and Ale)")

#Internal classification knn
classifications = knn.cv(BBIPAAle[,c(5,6)],BBIPAAle$AleType, prob = TRUE, k = 5)
  table(classifications,BBIPAAle$AleType)
##                
## classifications Ale IPA
##             Ale 893  64
##             IPA  69 507
  CM = confusionMatrix(table(classifications,BBIPAAle$AleType))
  CM
## Confusion Matrix and Statistics
## 
##                
## classifications Ale IPA
##             Ale 893  64
##             IPA  69 507
##                                          
##                Accuracy : 0.9132         
##                  95% CI : (0.898, 0.9269)
##     No Information Rate : 0.6275         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.8147         
##                                          
##  Mcnemar's Test P-Value : 0.7287         
##                                          
##             Sensitivity : 0.9283         
##             Specificity : 0.8879         
##          Pos Pred Value : 0.9331         
##          Neg Pred Value : 0.8802         
##              Prevalence : 0.6275         
##          Detection Rate : 0.5825         
##    Detection Prevalence : 0.6243         
##       Balanced Accuracy : 0.9081         
##                                          
##        'Positive' Class : Ale            
## 
#Naive Bayes
splitPerc = .7 #Training / Test split Percentage
trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
train = BBIPAAle[trainI,]
test = BBIPAAle[-trainI,]
model = naiveBayes(train[,c(5,6)],train$AleType)
table(predict(model,test[,c(5,6)]),test$AleType)
##      
##       Ale IPA
##   Ale 245  25
##   IPA  35 155
CM = confusionMatrix(table(predict(model,test[,c(5,6)]),test$AleType))
CM
## Confusion Matrix and Statistics
## 
##      
##       Ale IPA
##   Ale 245  25
##   IPA  35 155
##                                          
##                Accuracy : 0.8696         
##                  95% CI : (0.8353, 0.899)
##     No Information Rate : 0.6087         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.7289         
##                                          
##  Mcnemar's Test P-Value : 0.2453         
##                                          
##             Sensitivity : 0.8750         
##             Specificity : 0.8611         
##          Pos Pred Value : 0.9074         
##          Neg Pred Value : 0.8158         
##              Prevalence : 0.6087         
##          Detection Rate : 0.5326         
##    Detection Prevalence : 0.5870         
##       Balanced Accuracy : 0.8681         
##                                          
##        'Positive' Class : Ale            
## 
#External classification knn (train and test)
splitPerc = .7 #Training / Test split Percentage
trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
train = BBIPAAle[trainI,]
test = BBIPAAle[-trainI,]
model = naiveBayes(train[,c(5,6)],train$AleType)
table(predict(model,test[,c(5,6)]),test$AleType)
##      
##       Ale IPA
##   Ale 255  20
##   IPA  41 144
CM = confusionMatrix(table(predict(model,test[,c(5,6)]),test$AleType))
CM
## Confusion Matrix and Statistics
## 
##      
##       Ale IPA
##   Ale 255  20
##   IPA  41 144
##                                          
##                Accuracy : 0.8674         
##                  95% CI : (0.8329, 0.897)
##     No Information Rate : 0.6435         
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.719          
##                                          
##  Mcnemar's Test P-Value : 0.01045        
##                                          
##             Sensitivity : 0.8615         
##             Specificity : 0.8780         
##          Pos Pred Value : 0.9273         
##          Neg Pred Value : 0.7784         
##              Prevalence : 0.6435         
##          Detection Rate : 0.5543         
##    Detection Prevalence : 0.5978         
##       Balanced Accuracy : 0.8698         
##                                          
##        'Positive' Class : Ale            
## 
#Separate dataset by beer style (IPA, Ale, Lager) and create bar chart.
BBClassify <- TestBB1
BBClassify$BeerType = BBClassify$AleType
BBClassify$BeerType = ifelse(str_detect(BBClassify$Style, "IPA"),"IPA",ifelse(str_detect(BBClassify$Style, "Stout"), "Stout",ifelse(str_detect(BBClassify$Style, "Pilsner"),"Pilsner",ifelse(str_detect(BBClassify$Style, "Beer"),"Beer",ifelse(str_detect(BBClassify$Style, " Ale"),"Ale",ifelse(str_detect(BBClassify$Style, "Lager"),"Lager","Other"))))))
BBClassify %>% filter(BeerType == "IPA" |BeerType == "Ale" |BeerType == "Lager") %>% ggplot(aes(x=IBU, y = ABV, color = BeerType)) + geom_jitter() + ggtitle("Distribution of IBU versus ABV by Style (IPA, Ale, Lager)")

BBClassify %>% ggplot(aes(x=BeerType, fill= BeerType)) + geom_bar() + ggtitle("Distribution of Beers by Beer Type") + ylab("Beers") + xlab("Beer Type")